# 05_calc_pha_dist_cent.R
# Calculate pairwise distances between preference system graphs 
# Assemble separate graph for each system and then
# measure distance between these graphs based on 
# differences in PageRank
# Includes PHAs with no preferences as graphs with 
# no edges. 
# Produces:
# - dist_pagerank_imp_XX_exp_XX_damp_XX.RDS: distances between prioritization
#   policy graphs based on centrality distance 

library(dplyr)
library(tidyr)
library(purrr)
library(igraph)
library(here)

options(scipen=999)

# Uncomment to retrieve arguments if calling from external script
# args <- commandArgs(trailingOnly = TRUE)

# set_implicit_weight <- as.numeric(args[1])
# set_explicit_weight <- as.numeric(args[2])
# set_damp_factor <- as.numeric(args[3])

set_implicit_weight <- .25
set_explicit_weight <- 1
set_damp_factor <- .85

print(paste0("Implicit weight = ", set_implicit_weight, ", 
             Explicit weight = ", set_explicit_weight, 
             ", Damping factor = ", set_damp_factor))

# Load data on PHAs in sample
pha_pref_ref <- readRDS(here("data", "pha_pref_ref.RDS"))
# Load coded preference data
codes <- readRDS(here("data", "codes_expanded.RDS"))


# Load all edges generated in 03_part_i_hierarchy.R
all_edges <- readRDS(here("data", "intermediate", "all_edges.RDS"))

#################################
## Preliminary data processing ##
#################################

# Create df with all PHAs in sample and their 
# preferences, including all PHAs with no preferences

# First identify observations from Abt to append
abt_rows <- pha_pref_ref %>%
  filter(any_preferences == 0 & from_abt == TRUE) %>%
  select(pha_code)

coded_prefs <- codes %>%
  dplyr::select(pha_code:special_defs) %>%
  # Add rows for Abt entries
  bind_rows(abt_rows) %>%
  # Zero out rows with no preferences from Abt survey
  mutate_at(vars(unit_single:special_defs), list(~ifelse(is.na(.), 0, .))) 

#######################################################
## Create a list of adjacency matrixes, one for each ##
## preference system                                 ##
#######################################################
# Adjacency matrix setup: each matrix contains all nodes
# Draw implicit and explicit edges 
# This follows the same procedure as implemented in 03_part_i_hierarchy.R
# The only difference is that a separate df is assembled 
# for each PHA 

# Set the downweight for implicit edges
implicit_weight <- set_implicit_weight 
explicit_weight <- set_explicit_weight 

# String for filename suffix to be used when saving distance matrixes
fn_suffix <- paste0("_imp_", 100 * implicit_weight, 
                    "_exp_", 100 * explicit_weight, 
                    "_damp_", (100 * set_damp_factor))

# Create dataframe of all possible edges
# This is necessary to complete a complete 
# adjacency matrix for each PHA 
# This also adds in all the PHAs that don't have 
# any preferences
no_pref_phas <- pha_pref_ref %>%
  filter(any_preferences == 0 & in_sample_abt == TRUE) %>%
  select(pha_code)

# Create a reference for all edges possible for all PHAs
all_possible_edges <- all_edges %>%
  select(-implicit_link_base) %>%
  bind_rows(no_pref_phas) %>%
  tidyr::expand(pha_code, category_from, category_to) %>%
  filter(!is.na(category_from) & !is.na(category_to))

# Use all possible edges as a base
# Join with actual observed edges 
# to form adjacency matrices that 
# contain all categories as nodes 

adj_downweight_edges <- all_possible_edges %>%
  ungroup() %>%
  left_join(all_edges, by = c("pha_code", "category_from", "category_to")) %>%
  mutate(weight = case_when(is.na(implicit_link_base) ~ 0,
                            implicit_link_base == TRUE ~ implicit_weight, 
                            implicit_link_base == FALSE ~ explicit_weight)) %>%
  group_by(pha_code, category_from, category_to) %>% 
  summarise(edge_weight = sum(weight)) %>%
  ungroup() 

adj_downweight <- adj_downweight_edges %>%
  spread(category_to, edge_weight)

# Create a nested version of the adjacency matrxies 
adj_nested <- adj_downweight %>%
  arrange(pha_code, category_from) %>%
  group_by(pha_code) %>%
  nest()

# Function to create correctly formatted adjacency matrix
get_formatted_adj <- function(adj_df) {
  # Save rownames
  rownames(adj_df) <- adj_df$category_from
  
  # Generate network object
  formatted_adj <- adj_df %>%
    ungroup() %>%
    select(-category_from) %>%
    select(sort(current_vars())) %>%
    as.matrix()
  return(formatted_adj)
} 

# Function to create a network object for a 
# given PHA, given the adjacency matrix 
get_network_obj <- function(adj_df) {
  # Save rownames
  rownames(adj_df) <- adj_df$category_from
  
  # Generate network object
  network_obj <- adj_df %>%
    ungroup() %>%
    select(-category_from) %>%
    data.matrix() %>%
    graph.adjacency(mode = "directed", weighted = TRUE, diag = FALSE)
  return(network_obj)
} 

# Create df of nested network objects
pha_networks <- adj_nested %>%
  mutate(network = map(data, get_network_obj),
         adj = map(data, get_formatted_adj)) %>%
  # Adj_matrix from igraph to generate sparseMatrix
  mutate(adj_matrix = map(network, as_adjacency_matrix, sparse = TRUE))

rm(adj_downweight)
rm(adj_downweight_edges)

#######################
# Centrality distance #
#######################

# Calculate distance based on differences centrality. Modification of
# implementation of nd.centrality in NetworkDistance package
# Allows for the use of alternate measures of centrality
# Helper functions graph_transform and list_transform 
# come directly from the package. 
# see 
# library("NetworkDistance")
# https://github.com/kyoustat/NetworkDistance

graph_transform <- function(obj, NIs="allowed"){
  # 1. package:: igraph
  if (class(obj)=="igraph"){
    output = as_adjacency_matrix(obj)
  # 2. package:: network
  } else if (class(obj)=="network"){
    output = Matrix::Matrix(as.matrix.network(obj, matrix.type = "adjacency"), 
                            sparse=TRUE)
  # 3. simple matrix
  } else {
    output = Matrix::Matrix(obj, sparse=TRUE)
  }
  if (NIs!="allowed"){
    if ((any(is.na(output)))||(any(is.infinite(output)))){
      stop("* NetworkDistance : inputs of NA, Inf, or -Inf are not allowed.")
    }
  }
  return(output)
}

list_transform <- function(A, NIflag="allowed"){
  # 1. size checker
  n = nrow(A[[1]])
  if (ncol(A[[1]])!=n){
    stop("* NetworkDistance : an input list should contain all square matrices.")
  }
  # 2. transform
  listA = list()
  for (i in 1:length(A)){
    tgt = graph_transform(A[[i]], NIs=NIflag)
    if ((nrow(tgt)!=n)||(ncol(tgt)!=n)){
      stop(paste("* NetworkDistance : ",i,
                 "-st/rd/th matrix in the list has non-matching size.",sep=""))
    }
    listA[[i]] = tgt
  }
  return(listA)
}

nd_centrality_custom <- function (A, out.dist = TRUE, mode = c("In-Degree", 
                                                               "Prop-In-Degree", 
                                                               "PageRank",
                                                               "EigenCent"), 
                                  damp_factor = NULL,
                                  directed = FALSE) {
  if ((!is.list(A)) || (length(A) <= 1)) {
    stop("* nd.csd : input 'A' should be a list of length larger than 1.")
  }
  listA = list_transform(A, NIflag = "not")
  N = length(listA)
  M = nrow(listA[[1]])
  if ((!is.logical(out.dist)) || (!is.logical(directed))) {
    stop("* nd.centrality : 'out.dist' and 'directed' should be logical variables.")
  }
  if (missing(mode)) {
    mode = "Degree"
  }
  else {
    mode = match.arg(mode)
  }
  mat_features = array(0, c(N, M))
  mat_dist = array(0, c(N, N))
  for (i in 1:N) {
    if (directed == FALSE) {
      tgt = graph_from_adjacency_matrix(listA[[i]], mode = "undirected")
    }
    else {
      tgt = graph_from_adjacency_matrix(listA[[i]], mode = "directed",
                                        diag = FALSE, weighted = TRUE)
    }
    if (mode == "In-Degree") {
      mat_features[i, ] = as.vector(igraph::degree(tgt, mode = "in"))
    }
    else if (mode == "Prop-In-Degree") {
      deg_in = as.vector(igraph::degree(tgt, mode = "in"))
      deg_total = as.vector(igraph::degree(tgt, mode = "total"))
      mat_features[i, ] = deg_in / deg_total
    }
    else if (mode == "PageRank") {
      # Note this returns the PageRanks in alpha order of node name 
      prs = igraph::page.rank(tgt, directed = TRUE, damping = damp_factor,
                                            weights = E(tgt)$weight)$vector
      mat_features[i, ] = prs
      if (i == 1) {
        # store colnames
        saveRDS(names(prs), here("data", "intermediate", 
                                 paste0("pr_colnames_", fn_suffix, ".RDS")))
      }

    }
    else if (mode == "EigenCent") {
      mat_features[i, ] = igraph::eigen_centrality(tgt, directed = TRUE, 
                                                   scale = TRUE)$vector
    }
  }

  for (i in 1:(N - 1)) {
    vec1 = mat_features[i, ]
    for (j in (i + 1):N) {
      vec2 = mat_features[j, ]
      solution = sum(abs(vec1 - vec2))
      mat_dist[i, j] = solution
      mat_dist[j, i] = solution
    }
  }
  if (out.dist) {
    mat_dist = as.dist(mat_dist)
  }
  result = list()
  result$D = mat_dist
  result$features = mat_features
  return(result)
}

dist_pagerank <- nd_centrality_custom(pha_networks$adj, out.dist = TRUE, 
                                      mode = "PageRank",
                                      damp_factor = set_damp_factor,
                                      directed = TRUE)

# Store 
saveRDS(dist_pagerank, here("data", "intermediate", 
                            paste0("dist_pagerank", fn_suffix, ".RDS")))

